# coding: utf-8 
# @Time : 2021/5/13 14:24
# @Author : shenshaoxiong
# @FileName: GetURL2db.py
# @Email   : 765105236@qq.com

from .Constant import HOST, USER, PASSWORD, DB, PORT
from lxml import etree
from .Funcs import get_response
from .Funcs import get_mysql_data
from .Funcs import insert_mysql
from .Funcs import get_md5
from .Funcs import get_now_time
import time


def get_url_2_db(source=""):
    """Collect links from the seed pages of *source* and queue them in MySQL.

    Reads the ``source``/``url`` columns of the ``news_base`` rows whose
    ``source`` equals *source*, fetches each seed URL with ``get_response``,
    extracts every ``<a href>`` value that starts with ``http`` and does not
    end with ``/``, and inserts one row per distinct link into
    ``news_url_list`` with ``state`` set to 0.

    Seed pages that yield no response, no text, or no parseable document are
    skipped so that one bad page does not abort the whole batch.

    Args:
        source: Value matched against the ``source`` column of ``news_base``
            and written to the ``source`` column of every inserted row.

    Returns:
        None.
    """
    # NOTE(review): `source` is interpolated directly into the SQL filter that
    # is handed to get_mysql_data as a plain string, so it must come from
    # trusted code. Parameterizing it would require changing that helper.
    source_filter = "source='%s'" % source
    seed_rows = get_mysql_data(host=HOST, user=USER, password=PASSWORD, db=DB, port=PORT, table="news_base",
                               field_list=["source", "url"], size=None, filter=source_filter)

    insert_table = "news_url_list"
    insert_field_list = ["id", "source", "url", "insert_time", "state"]
    insert_values_list = []
    # The row id is the MD5 of the link, so the same link found on two seed
    # pages would produce two rows with an identical id. Track links across
    # all pages and keep first-seen order so the insert is deterministic.
    seen_links = set()

    # `or []` assumes get_mysql_data may return None when nothing matches --
    # TODO confirm against the helper.
    for seed_row in seed_rows or []:
        response = get_response(seed_row["url"])
        if not response:
            # Presumably a failed fetch yields a falsy value; skip the page.
            continue
        html = response["text"]
        if not html:
            # lxml cannot build a document from an empty body.
            continue
        page = etree.HTML(html)
        if page is None:
            continue

        for href in page.xpath('//a/@href'):
            # Keep absolute http(s) links only, and drop those ending in "/".
            if not href.startswith("http") or href.endswith("/"):
                continue
            if href in seen_links:
                continue
            seen_links.add(href)
            insert_values_list.append((get_md5(href), source, href, get_now_time(), 0))

    # Nothing collected means nothing to insert; avoid an empty bulk insert.
    if insert_values_list:
        insert_mysql(host=HOST, user=USER, password=PASSWORD, db=DB, port=PORT, table=insert_table,
                     field_list=insert_field_list, values_list=insert_values_list)


# if __name__ == "__main__":
#     get_url_2_db(source="新浪国内")
